Load the necessary packages for the script.

  library(tidyverse)
  library(ggplot2)
  library(readxl)
  library(janitor)
  library(tidyr)
  library(dplyr)
  library(lubridate)

Problem 1: Read and clean the Mr. Trash Wheel sheet.

Specify the sheet in the Excel file and omit non-data entries (rows with notes/figures; columns containing notes) using arguments in read_excel. Use reasonable variable names. Omit rows that do not include dumpster-specific data. Round the number of sports balls to the nearest integer. Then read and clean precipitation data for 2018 and 2019. For each, omit rows without precipitation data and add a variable for year. Next, combine the precipitation datasets and convert month to a character variable (the variable month.name is built into R and should be useful).

Write a paragraph about these data; you are encouraged to use inline R. Be sure to note the number of observations in both resulting datasets, and give examples of key variables. For available data, what was the total precipitation in 2018? What was the median number of sports balls in a dumpster in 2019?

Problem 1

# I decided to set echo to TRUE, so both the code and its results appear in
# the knitted HTML file.

# Read the Mr. Trash Wheel sheet with read_excel. Skip the 1st row, which
# contains an image rather than data, and treat empty cells as NA.
trash_data = read_excel("./data/Trash-Wheel-Collection-Totals-7-2020-2.xlsx", skip =1, na = "")
## New names:
## * `` -> ...15
## * `` -> ...16
## * `` -> ...17
# Quick glimpse of the raw data. Note the unnamed trailing columns
# (...15 to ...17) that read_excel created from the sheet's note columns.
head(trash_data, 5)
## # A tibble: 5 x 17
##   Dumpster Month  Year Date                `Weight (tons)` `Volume (cubic yards…
##   <chr>    <chr> <dbl> <dttm>                        <dbl>                 <dbl>
## 1 1        May    2014 2014-05-16 00:00:00            4.31                    18
## 2 2        May    2014 2014-05-16 00:00:00            2.74                    13
## 3 3        May    2014 2014-05-16 00:00:00            3.45                    15
## 4 4        May    2014 2014-05-17 00:00:00            3.1                     15
## 5 5        May    2014 2014-05-17 00:00:00            4.06                    18
## # … with 11 more variables: Plastic Bottles <dbl>, Polystyrene <dbl>,
## #   Cigarette Butts <dbl>, Glass Bottles <dbl>, Grocery Bags <dbl>,
## #   Chip Bags <dbl>, Sports Balls <dbl>, Homes Powered* <dbl>, ...15 <chr>,
## #   ...16 <lgl>, ...17 <lgl>
# Variable names as read from Excel: mixed case, spaces, and punctuation.
names(trash_data)
##  [1] "Dumpster"             "Month"                "Year"                
##  [4] "Date"                 "Weight (tons)"        "Volume (cubic yards)"
##  [7] "Plastic Bottles"      "Polystyrene"          "Cigarette Butts"     
## [10] "Glass Bottles"        "Grocery Bags"         "Chip Bags"           
## [13] "Sports Balls"         "Homes Powered*"       "...15"               
## [16] "...16"                "...17"
# Convert all column names to lower snake_case for easier reference.
trash_data = janitor::clean_names(trash_data)
names(trash_data)
##  [1] "dumpster"           "month"              "year"              
##  [4] "date"               "weight_tons"        "volume_cubic_yards"
##  [7] "plastic_bottles"    "polystyrene"        "cigarette_butts"   
## [10] "glass_bottles"      "grocery_bags"       "chip_bags"         
## [13] "sports_balls"       "homes_powered"      "x15"               
## [16] "x16"                "x17"
# Omit entries that are not dumpster-specific data.
# read_excel created three trailing note columns (x15-x17) from the sheet's
# side notes; drop them first so na.omit() only considers real data columns,
# then remove the rows with missing values (the note/figure rows).
# ncol() is clearer than length() for counting data-frame columns.
trash_data <- trash_data[seq_len(ncol(trash_data) - 3)] # drop last 3 columns
trash_data <- na.omit(trash_data)

# Round the number of sports balls to the nearest integer, as requested.
trash_data$sports_balls <- round(trash_data$sports_balls)

# Read the 2019 and 2018 precipitation sheets (sheets 6 and 7), skipping the
# first row (an image) and treating blank cells as NA.
precipitation19_data = read_excel("./data/Trash-Wheel-Collection-Totals-7-2020-2.xlsx", skip =1, na = "", sheet = 6)
precipitation18_data = read_excel("./data/Trash-Wheel-Collection-Totals-7-2020-2.xlsx", skip =1, na = "", sheet = 7)

# Add a year variable to each dataset before combining them.
precipitation19_data$Year <- "2019"
precipitation18_data$Year <- "2018"

# Omit rows without precipitation data. Note: the original `Month != "NA"`
# compared against the literal string "NA"; true missing values yield NA in
# that comparison, and indexing with NA keeps the rows (as all-NA rows), so
# nothing was actually dropped. is.na() is the correct test.
precipitation19_data <- precipitation19_data[!is.na(precipitation19_data$Month), ]
precipitation18_data <- precipitation18_data[!is.na(precipitation18_data$Month), ]

# Keep only the 12 monthly rows (drops any trailing totals/notes rows).
precipitation19_data <- precipitation19_data[1:12, ]
precipitation18_data <- precipitation18_data[1:12, ]

# Combine the two years into one dataset (keys: Month, Total, Year).
precipitation_data = 
  full_join(precipitation18_data, precipitation19_data)
## Joining, by = c("Month", "Total", "Year")

# Convert the numeric month to its name (1 -> "January") via the built-in
# month.name vector.
precipitation_data$Month <- month.name[precipitation_data$Month]

# Descriptive statistics for the cleaned trash dataset (453 dumpster records).
summary(trash_data)
##    dumpster            month                year     
##  Length:453         Length:453         Min.   :2014  
##  Class :character   Class :character   1st Qu.:2015  
##  Mode  :character   Mode  :character   Median :2018  
##                                        Mean   :2017  
##                                        3rd Qu.:2019  
##                                        Max.   :2021  
##       date                      weight_tons   volume_cubic_yards
##  Min.   :1900-01-20 00:00:00   Min.   :0.78   Min.   : 7.00     
##  1st Qu.:2015-12-26 00:00:00   1st Qu.:2.72   1st Qu.:15.00     
##  Median :2018-02-05 00:00:00   Median :3.19   Median :15.00     
##  Mean   :2017-07-03 05:36:57   Mean   :3.20   Mean   :15.41     
##  3rd Qu.:2019-05-28 00:00:00   3rd Qu.:3.68   3rd Qu.:15.00     
##  Max.   :2021-01-04 00:00:00   Max.   :5.62   Max.   :20.00     
##  plastic_bottles  polystyrene   cigarette_butts  glass_bottles   
##  Min.   : 210    Min.   : 210   Min.   :   980   Min.   :  0.00  
##  1st Qu.: 980    1st Qu.: 950   1st Qu.:  5000   1st Qu.:  9.00  
##  Median :1850    Median :1650   Median : 11000   Median : 18.00  
##  Mean   :1899    Mean   :1921   Mean   : 24522   Mean   : 22.45  
##  3rd Qu.:2640    3rd Qu.:2730   3rd Qu.: 32000   3rd Qu.: 32.00  
##  Max.   :5960    Max.   :6540   Max.   :310000   Max.   :110.00  
##   grocery_bags    chip_bags     sports_balls   homes_powered  
##  Min.   :  50   Min.   : 180   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 425   1st Qu.: 800   1st Qu.: 5.00   1st Qu.:38.83  
##  Median : 820   Median :1327   Median : 9.00   Median :51.17  
##  Mean   :1104   Mean   :1558   Mean   :11.75   Mean   :45.32  
##  3rd Qu.:1620   3rd Qu.:2150   3rd Qu.:16.00   3rd Qu.:58.67  
##  Max.   :3750   Max.   :5085   Max.   :56.00   Max.   :93.67
# Total precipitation in 2018 (inches), summed over the monthly totals.
total_precipitation_2018 <- sum(precipitation18_data$Total)
# 70.83

# Median number of sports balls across all dumpsters and years.
median(trash_data$sports_balls)
## [1] 9

# The question asks for the MEDIAN number of sports balls in a dumpster in
# 2019; the original summarise_at(..., mean) computed the MEAN per year.
# Compute the per-year median instead (the 2019 row answers the question).
trash_data %>%
  group_by(year) %>%
  summarise(median_sports_balls = median(sports_balls))
## # A tibble: 8 x 2
##    year  name
##   <dbl> <dbl>
## 1  2014  6.05
## 2  2015 17.4 
## 3  2016 25.7 
## 4  2017  9.64
## 5  2018  4.82
## 6  2019 10.0 
## 7  2020 11.9 
## 8  2021 12
#Note: 10.030303 is the MEAN number of sports balls per dumpster in 2019 (the table above reports means), not the median; the overall median across all years was 9.

#the precipitation_data has 24 observations of 3 variables: Month, Total (total rainfall), and Year. the trash_data dataset has 533 observations of 17 variables before cleaning; 80 of those rows are NA/non-dumpster entries and 3 columns are notes, so there are really 453 observations of 14 variables after cleaning. Key variables include dumpster (an id), month, and year, as well as the types of trash picked up, like glass_bottles, grocery_bags, and cigarette_butts. the total precipitation in 2018 was 70.83 inches, and the mean number of sports balls in a dumpster in 2019 was about 10.03 (note this figure is the mean reported by the code above, not the median).
summary(trash_data)
##    dumpster            month                year     
##  Length:453         Length:453         Min.   :2014  
##  Class :character   Class :character   1st Qu.:2015  
##  Mode  :character   Mode  :character   Median :2018  
##                                        Mean   :2017  
##                                        3rd Qu.:2019  
##                                        Max.   :2021  
##       date                      weight_tons   volume_cubic_yards
##  Min.   :1900-01-20 00:00:00   Min.   :0.78   Min.   : 7.00     
##  1st Qu.:2015-12-26 00:00:00   1st Qu.:2.72   1st Qu.:15.00     
##  Median :2018-02-05 00:00:00   Median :3.19   Median :15.00     
##  Mean   :2017-07-03 05:36:57   Mean   :3.20   Mean   :15.41     
##  3rd Qu.:2019-05-28 00:00:00   3rd Qu.:3.68   3rd Qu.:15.00     
##  Max.   :2021-01-04 00:00:00   Max.   :5.62   Max.   :20.00     
##  plastic_bottles  polystyrene   cigarette_butts  glass_bottles   
##  Min.   : 210    Min.   : 210   Min.   :   980   Min.   :  0.00  
##  1st Qu.: 980    1st Qu.: 950   1st Qu.:  5000   1st Qu.:  9.00  
##  Median :1850    Median :1650   Median : 11000   Median : 18.00  
##  Mean   :1899    Mean   :1921   Mean   : 24522   Mean   : 22.45  
##  3rd Qu.:2640    3rd Qu.:2730   3rd Qu.: 32000   3rd Qu.: 32.00  
##  Max.   :5960    Max.   :6540   Max.   :310000   Max.   :110.00  
##   grocery_bags    chip_bags     sports_balls   homes_powered  
##  Min.   :  50   Min.   : 180   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 425   1st Qu.: 800   1st Qu.: 5.00   1st Qu.:38.83  
##  Median : 820   Median :1327   Median : 9.00   Median :51.17  
##  Mean   :1104   Mean   :1558   Mean   :11.75   Mean   :45.32  
##  3rd Qu.:1620   3rd Qu.:2150   3rd Qu.:16.00   3rd Qu.:58.67  
##  Max.   :3750   Max.   :5085   Max.   :56.00   Max.   :93.67
head(trash_data)
## # A tibble: 6 x 14
##   dumpster month  year date                weight_tons volume_cubic_yards
##   <chr>    <chr> <dbl> <dttm>                    <dbl>              <dbl>
## 1 1        May    2014 2014-05-16 00:00:00        4.31                 18
## 2 2        May    2014 2014-05-16 00:00:00        2.74                 13
## 3 3        May    2014 2014-05-16 00:00:00        3.45                 15
## 4 4        May    2014 2014-05-17 00:00:00        3.1                  15
## 5 5        May    2014 2014-05-17 00:00:00        4.06                 18
## 6 6        May    2014 2014-05-20 00:00:00        2.71                 13
## # … with 8 more variables: plastic_bottles <dbl>, polystyrene <dbl>,
## #   cigarette_butts <dbl>, glass_bottles <dbl>, grocery_bags <dbl>,
## #   chip_bags <dbl>, sports_balls <dbl>, homes_powered <dbl>
head(precipitation_data)
## # A tibble: 6 x 3
##   Month    Total Year 
##   <chr>    <dbl> <chr>
## 1 January   0.94 2018 
## 2 February  4.8  2018 
## 3 March     2.69 2018 
## 4 April     4.69 2018 
## 5 May       9.27 2018 
## 6 June      4.77 2018

Problem 2

This problem uses the FiveThirtyEight data; these data were gathered to create the interactive graphic on this page. In particular, we’ll use the data in pols-month.csv, unemployment.csv, and snp.csv. Our goal is to merge these into a single data frame using year and month as keys across datasets.

First, clean the data in pols-month.csv. Use separate() to break up the variable mon into integer variables year, month, and day; replace month number with month name; create a president variable taking values gop and dem, and remove prez_dem and prez_gop; and remove the day variable.

Second, clean the data in snp.csv using a similar process to the above. For consistency across datasets, arrange according to year and month, and organize so that year and month are the leading columns.

Third, tidy the unemployment data so that it can be merged with the previous datasets. This process will involve switching from “wide” to “long” format; ensuring that key variables have the same name; and ensuring that key variables take the same values.

Join the datasets by merging snp into pols, and merging unemployment into the result.

Write a short paragraph about these datasets. Explain briefly what each dataset contained, and describe the resulting dataset (e.g. give the dimension, range of years, and names of key variables).

Note: we could have used a date variable as a key instead of creating year and month keys; doing so would help with some kinds of plotting, and be a more accurate representation of the data. Date formats are tricky, though. For more information check out the lubridate package in the tidyverse.

Problem 2

# Read pols-month.csv, convert column names to lower snake_case, and use
# separate() to split the mon date (printed as yyyy-mm-dd) into year, month,
# and day. Note: separate() produces character columns, so year/month/day
# are strings here and are converted as needed below.
pols_data = read_csv(file = "./data/fivethirtyeight_datasets/pols-month.csv") %>%
  janitor::clean_names() %>%
  separate(mon, into = c("year", "month", "day"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   mon = col_date(format = ""),
##   prez_gop = col_double(),
##   gov_gop = col_double(),
##   sen_gop = col_double(),
##   rep_gop = col_double(),
##   prez_dem = col_double(),
##   gov_dem = col_double(),
##   sen_dem = col_double(),
##   rep_dem = col_double()
## )
# separate() returned character columns; convert month to numeric, then map
# it to the month name ("1" -> "January") via the built-in month.name vector.
pols_data$month <- as.numeric(pols_data$month)
pols_data$month <- month.name[pols_data$month]

# Create a president variable taking values "gop" and "dem" from the
# prez_gop indicator (1 = Republican president in office, 0 = Democrat).
# ifelse() is vectorized, so no intermediate as.character() conversion or
# manual per-value recoding is needed.
pols_data$president <- ifelse(pols_data$prez_gop == 1, "gop", "dem")

# Keep only the key columns; this drops day, prez_gop, prez_dem, and the
# governor/senator/representative counts in one step.
pols_data <- subset(pols_data, select = c(year, month, president))

# Trim the "19"/"20" century prefix so the year format matches snp_data
# (two-digit years once its date is separated).
pols_data$year <- substring(pols_data$year, 3)


# Clean snp.csv with the same process: read, normalize names, and split the
# date (m/d/y) into month, day, and year character columns.
snp_data = read_csv(file = "./data/fivethirtyeight_datasets/snp.csv") %>%
  janitor::clean_names() %>%
  separate(date, into = c("month", "day", "year")) 

# Convert the month number to its name (1 -> "January").
snp_data$month <- as.numeric(snp_data$month)
snp_data$month <- month.name[snp_data$month]

# Organize so year and month are the leading columns and drop day. subset()
# both selects and orders the columns, so the separate positional reordering
# step (snp_data[, c(3, 1, 2, 4)]) in the original was redundant.
snp_data <- subset(snp_data, select = c(year, month, close))

# Tidy unemployment_data: read the CSV and normalize the column names to
# lower snake_case (Jan -> jan). The wide-to-long reshape and key alignment
# with the other datasets happen below.

unemployment_data = read_csv(file = "./data/fivethirtyeight_datasets/unemployment.csv") %>%
  janitor::clean_names() 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   Year = col_double(),
##   Jan = col_double(),
##   Feb = col_double(),
##   Mar = col_double(),
##   Apr = col_double(),
##   May = col_double(),
##   Jun = col_double(),
##   Jul = col_double(),
##   Aug = col_double(),
##   Sep = col_double(),
##   Oct = col_double(),
##   Nov = col_double(),
##   Dec = col_double()
## )
# Reshape from wide (one column per month) to long format: one row per
# year-month, with the rate stored in the `unemployment` column.
unemployment_data <- pivot_longer(
    unemployment_data, 
    jan:dec,
    names_to = "month", 
    values_to = "unemployment")

# Trim the century prefix from year (1948 -> "48") so the format matches the
# snp and pols datasets; substring() coerces the numeric year to character.
unemployment_data$year <- substring(unemployment_data$year, 3)

# Rename the month values to match the snp and pols data ("feb" -> "February").
# clean_names() lowercased the column headers, so the pivoted month values
# are lowercase abbreviations; matching them against a lowercased month.abb
# gives each month's index, which then looks up the full name in month.name.
# This vectorized lookup replaces twelve manual assignments.
unemployment_data$month <- month.name[match(unemployment_data$month, tolower(month.abb))]


# Join the datasets: merge snp into pols (keys: year and month), then merge
# unemployment into the result.
merged_political_data = 
  full_join(pols_data, snp_data)
## Joining, by = c("year", "month")
# Note: there is no S&P closing-price data before 1950.

merged_political_data =
  full_join(merged_political_data, unemployment_data)
## Joining, by = c("year", "month")
# Show the last 20 rows: the final months of 2015 have NA close/unemployment
# because those two sources end partway through 2015.
tail(merged_political_data, 20) # the tail shows where snp/unemployment coverage ends
## # A tibble: 20 x 5
##    year  month     president close unemployment
##    <chr> <chr>     <chr>     <dbl>        <dbl>
##  1 14    May       dem       1924.          6.3
##  2 14    June      dem       1960.          6.1
##  3 14    July      dem       1931.          6.2
##  4 14    August    dem       2003.          6.1
##  5 14    September dem       1972.          5.9
##  6 14    October   dem       2018.          5.7
##  7 14    November  dem       2068.          5.8
##  8 14    December  dem       2059.          5.6
##  9 15    January   dem       1995.          5.7
## 10 15    February  dem       2104.          5.5
## 11 15    March     dem       2068.          5.5
## 12 15    April     dem       2086.          5.4
## 13 15    May       dem       2107.          5.5
## 14 15    June      dem       2063.          5.3
## 15 15    July      <NA>      2080.         NA  
## 16 15    August    <NA>        NA          NA  
## 17 15    September <NA>        NA          NA  
## 18 15    October   <NA>        NA          NA  
## 19 15    November  <NA>        NA          NA  
## 20 15    December  <NA>        NA          NA
#the snp dataset contained information on the closing value of Standard & Poor’s stock market index (S&P), an economic indicator, and a corresponding date. the pols dataset contained information about the prevailing political party on a certain date (i.e. whether there was a democratic president in 1954). The unemployment dataset tracked unemployment rates since 1948. There is a slight disconnect between datasets: unemployment was only tracked starting in 1948, and the S&P in 1950, so some values are missing from before then. I decided to remove irrelevant information from pols_data, like the number of democratic governors, because it could get in the way of interpreting results unless analyzed with an understanding of political precedence (i.e. a certain number of senators of the same party are needed to pass bills alongside the president). The resulting dataset, merged_political_data, had 828 observations of 5 variables: year, month, president (the corresponding presidential party), closing price of the S&P, and the unemployment rate. The range of years goes from 1947 to 2015, although as noted before, the S&P and unemployment aren't tracked until 1950 and 1948, respectively. 

Problem 3

This problem uses data from NYC Open data on the popularity of baby names, and can be downloaded here.

Load and tidy the data. Note that, although these data may seem fairly well formatted initially, the names of a categorical predictor and the case structure of string variables changed over time; you’ll need to address this in your data cleaning. Also, some rows seem duplicated, and these will need to be removed (hint: google something like “dplyr remove duplicate rows” to get started).

Produce a well-structured, reader-friendly table showing the rank in popularity of the name “Olivia” as a female baby name over time; this should have rows for ethnicities and columns for year. Produce a similar table showing the most popular name among male children over time.

Finally, for male, white non-hispanic children born in 2016, produce a scatter plot showing the number of children with a name (y axis) against the rank in popularity of that name (x axis).

# Read the NYC popular baby names CSV and normalize column names to
# snake_case (e.g. "Child's First Name" -> childs_first_name).
babynames = read_csv(file = "./data/Popular_Baby_Names.csv") %>%
  janitor::clean_names()
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `Year of Birth` = col_double(),
##   Gender = col_character(),
##   Ethnicity = col_character(),
##   `Child's First Name` = col_character(),
##   Count = col_double(),
##   Rank = col_double()
## )
# Inspect the ethnicity labels for inconsistencies across years.
categories <- unique(babynames$ethnicity) 
categories

# The same groups appear under truncated labels in some years:
# "ASIAN AND PACIFIC ISLANDER" becomes "ASIAN AND PACI", "BLACK NON
# HISPANIC" becomes "BLACK NON HISP", and "WHITE NON HISPANIC" becomes
# "WHITE NON HISP". Standardize each pair to a single label.
babynames$ethnicity[babynames$ethnicity == "ASIAN AND PACIFIC ISLANDER"] <- "ASIAN AND PACI"
babynames$ethnicity[babynames$ethnicity == "BLACK NON HISPANIC"] <- "BLACK NON HISP"
babynames$ethnicity[babynames$ethnicity == "WHITE NON HISPANIC"] <- "WHITE NON HISP"

categories <- unique(babynames$ethnicity) 
categories
# Now only 4 categories remain: "ASIAN AND PACI", "BLACK NON HISP",
# "HISPANIC", "WHITE NON HISP".

# The case structure of names also changed over time ("JAYDEN" vs "Jayden"),
# so the same name would otherwise survive deduplication twice. Normalize to
# title case: unlike the commented-out tolower(), str_to_title() keeps a
# leading capital so names still display cleanly.
babynames$childs_first_name <- str_to_title(babynames$childs_first_name)

# Remove duplicate rows (distinct() keeps the first occurrence of each).
babynames <- distinct(babynames)

# Build the reader-friendly tables ----------------------------------------

# Rank in popularity of "Olivia" over time: one row per ethnicity, one
# column per year of birth.
olivia <- babynames %>%
  filter(childs_first_name == "Olivia") %>%
  pivot_wider(
    id_cols = ethnicity,
    names_from = "year_of_birth",
    values_from = "rank"
  )

# Most popular (rank 1) name among male children over time, same layout but
# with the name itself in the cells.
malebabynames <- babynames %>%
  filter(gender == "MALE", rank == "1") %>%
  pivot_wider(
    id_cols = ethnicity,
    names_from = "year_of_birth",
    values_from = "childs_first_name"
  )
# This table shows the most popular male name per ethnicity and year.

# Subset for the scatter plot: male, white non-hispanic children born in 2016.
whitenonhispanic2016 <- babynames %>%
  filter(
    ethnicity == "WHITE NON HISP",
    year_of_birth == "2016",
    gender == "MALE"
  )

# Scatter plot: number of children with a name (y axis) against the rank in
# popularity of that name (x axis), for male white non-hispanic children
# born in 2016, as the problem asks. The original mapped the NAME to the y
# axis (with its labels hidden), which does not show counts at all; it also
# labeled the plot as all "Non-Hispanic Children" rather than male white
# non-hispanic children.
library(plotly)

whitenonhispanic2016_plot = ggplot(data = whitenonhispanic2016, aes(x = rank, y = count)) +
  geom_point() +
  ggtitle("Count vs Rank of Male White Non-Hispanic Children Born in 2016") +
  xlab("rank in popularity") +
  ylab("number of children with the name")

# Build an interactive version of the plot for the knitted HTML.
int <- plotly_build(whitenonhispanic2016_plot) 
int

# Save the static scatter plot; ggsave() writes the most recent ggplot.
ggsave("nonhispanicwhite2016_scatter_plot.pdf", height = 4, width = 6)
head(babynames)
## # A tibble: 6 x 6
##   year_of_birth gender ethnicity      childs_first_name count  rank
##           <dbl> <chr>  <chr>          <chr>             <dbl> <dbl>
## 1          2016 FEMALE ASIAN AND PACI Olivia              172     1
## 2          2016 FEMALE ASIAN AND PACI Chloe               112     2
## 3          2016 FEMALE ASIAN AND PACI Sophia              104     3
## 4          2016 FEMALE ASIAN AND PACI Emily                99     4
## 5          2016 FEMALE ASIAN AND PACI Emma                 99     4
## 6          2016 FEMALE ASIAN AND PACI Mia                  79     5
head(olivia)
## # A tibble: 4 x 5
##   ethnicity      `2016` `2015` `2014` `2013`
##   <chr>           <dbl>  <dbl>  <dbl>  <dbl>
## 1 ASIAN AND PACI      1      1      1      3
## 2 BLACK NON HISP      8      4      8      6
## 3 HISPANIC           13     16     16     22
## 4 WHITE NON HISP      1      1      1      1
head(whitenonhispanic2016)
## # A tibble: 6 x 6
##   year_of_birth gender ethnicity      childs_first_name count  rank
##           <dbl> <chr>  <chr>          <chr>             <dbl> <dbl>
## 1          2016 MALE   WHITE NON HISP Joseph              261     1
## 2          2016 MALE   WHITE NON HISP Michael             260     2
## 3          2016 MALE   WHITE NON HISP David               255     3
## 4          2016 MALE   WHITE NON HISP Moshe               239     4
## 5          2016 MALE   WHITE NON HISP Jacob               236     5
## 6          2016 MALE   WHITE NON HISP James               231     6
head(malebabynames)
## # A tibble: 4 x 7
##   ethnicity      `2016` `2015` `2014` `2013` `2012` `2011` 
##   <chr>          <chr>  <chr>  <chr>  <chr>  <chr>  <chr>  
## 1 ASIAN AND PACI Ethan  Jayden Jayden Jayden RYAN   ETHAN  
## 2 BLACK NON HISP Noah   Noah   Ethan  Ethan  JAYDEN JAYDEN 
## 3 HISPANIC       Liam   Liam   Liam   Jayden JAYDEN JAYDEN 
## 4 WHITE NON HISP Joseph David  Joseph David  JOSEPH MICHAEL